library(ggplot2)
setwd("/Users/ab6415/Documents/Work_laptop_stuff/Bioinformatics/Analysis/Beltran_et_al_2020_NEE/03_04_Figures1-2/epimutation_clustering/")
epimut_data<-read.table("../../03_04_Figures1-2/k-means_segmentation_epimutable_genes.txt")
epimut_genes_noiso<-read.table("epimutable_genes.noiso.txt"); epimut_genes_noiso<-epimut_genes_noiso$V1
gene_order<-read.table("cel_geneorder_numbered.txt"); colnames(gene_order)<-c("chr","cosmidID","order_pos")
gene_positions<-read.table("cel_genepos_and_cosmidID.bed"); colnames(gene_positions)<-c("chr","start","end","cosmidID")
ttg_normcounts<-read.table("../../02_Normalised_counts/22G_DEseqnorm_counts_averaged.txt")
gene_order_and_positions<-merge(gene_order,gene_positions[,c("start","end","cosmidID")],by="cosmidID")
gene_order_and_positions<-gene_order_and_positions[-which(duplicated(gene_order_and_positions$cosmidID)),]
gene_order_and_positions<-gene_order_and_positions[order(gene_order_and_positions$order_pos),]
gene_order_and_positions$meanpos<-(gene_order_and_positions$start+gene_order_and_positions$end)/2
iso_to_noiso<-data.frame(iso=rownames(epimut_data),noiso=epimut_genes_noiso)
order_distributions<-function(gene_set){
gene_order<-c(diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="I"),"order_pos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="II"),"order_pos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="III"),"order_pos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="IV"),"order_pos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="V"),"order_pos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="X"),"order_pos"]))
return(gene_order)
}
distance_distributions<-function(gene_set){
distances<-c(diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="I"),"meanpos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="II"),"meanpos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="III"),"meanpos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="IV"),"meanpos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="V"),"meanpos"]),
diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="X"),"meanpos"]))
return(distances)
}
Here I calculate the number of genes separating epimutable genes in the linear genome - neighbouring genes are at a distance of 1, etc. I see a clear excess of neighbouring genes compared to randomly selected genes - in support of clustering.
hist(order_distributions(epimut_genes_noiso),breaks=seq(1000))
length(which(order_distributions(epimut_genes_noiso)==1))
## [1] 30
median(distance_distributions(epimut_genes_noiso))
## [1] 148452.2
length(order_distributions(epimut_genes_noiso))
## [1] 390
length(which(epimut_genes_noiso %in% gene_order_and_positions$cosmidID))
## [1] 396
random_sets_numneighbors<-rep(0,10000)
random_sets_distances<-rep(0,10000)
for (i in seq(10000)){
rand_set<-sample(gene_order_and_positions$cosmidID,size = 396,replace = FALSE)
random_sets_numneighbors[i]<-length(which(order_distributions(rand_set)==1))
random_sets_distances[i]<-median(distance_distributions(rand_set))
}
order_data<-data.frame(distance=c(order_distributions(epimut_genes_noiso),order_distributions(rand_set)),
data=c(rep("real",length(order_distributions(epimut_genes_noiso))),rep("simulated",length(order_distributions(rand_set)))))
ggplot(order_data)+geom_bar(aes(x=distance,fill=data),position ="dodge")+coord_cartesian(xlim=c(0,100))
library(ggplot2)
ggplot(data.frame(random_sets_numneighbors))+geom_bar(aes(x=random_sets_numneighbors))+
coord_cartesian(xlim=c(-1,32))+
geom_vline(xintercept = 30,col="red",linetype="dashed")+
xlab("number of neighbouring epimutable gene pairs")+
theme_classic()
ggsave("epimutable_gene_pairs_observed_vs_1e4randomsets.pdf")
## Saving 7 x 5 in image
We can see this type of effect also when considering the genomic distances (bp) between epimutable genes.
ggplot(data.frame(random_sets_distances))+geom_histogram(aes(x=random_sets_distances))+
coord_cartesian(xlim=c(100000,200000))+
geom_vline(xintercept = median(distance_distributions(epimut_genes_noiso)),col="red",linetype="dashed")+
xlab("median distance between consecutive epimutable genes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#distance distribution from real data
ggplot(data.frame(distance_distributions(epimut_genes_noiso)))+geom_histogram(aes(x=distance_distributions.epimut_genes_noiso.),breaks=seq(0,2.6e6,by = 1.3e4))+
coord_cartesian(xlim=c(0,2.6e6),ylim=c(0,50))
#distance distribution from random set
ggplot(data.frame(distance_distributions(rand_set)))+geom_histogram(aes(x=distance_distributions.rand_set.),breaks=seq(0,2.6e6,by = 1.3e4))+
coord_cartesian(xlim=c(0,2.6e6),ylim=c(0,50))
We see a clear enrichment of neighbouring epimutable genes relative to what would be expected by chance - suggesting that spatial clustering effects exist in the system. Similarly, the median distance between neighbouring epimutable genes is smaller than would be expected by chance, and comparing the distance distributions between epimutable genes and genes selected randomly shows that epimutable genes are enriched in the lower range of distances.
So with this we can conclude than genes subjected to epimutation tend to cluster more than expected - now the question is, do they epimutate in a coordinated manner?
To test this, I will select neighbouring epimutable genes and do the following test:
Also a Pearson correlation test was applied to the count data of pairs of genes across samples –> quantify number of genes with significant r>0.75 correlations
#with the full dataset
epimut_genes_noiso
## [1] 3R5.1 AC3.5 B0001.5 B0035.7 B0261.8
## [6] B0280.15 B0284.2 B0285.7 B0348.5 B0350.2
## [11] B0416.6 B0454.9 B0457.4 B0511.11 B0511.3
## [16] B0511.4 B0511.5 B0513.4 B0554.7 C01B7.6
## [21] C01B9.1 C02F5.10 C03D6.1 C03E10.3 C03G6.12
## [26] C04F6.1 C05D11.13 C05D2.8 C06A1.3 C06A5.12
## [31] C06B8.7 C06E4.5 C06H2.4 C07G1.6 C07G1.7
## [36] C07G3.9 C08B11.4 C08E3.2 C08E3.6 C08E8.7
## [41] C08F11.7 C09G5.7 C13B9.1 C14F5.3 C15B12.1
## [46] C16C10.11 C16C8.20 C16C8.21 C17F4.7 C18C4.17
## [51] C18D4.2 C18D4.6 C24H10.4 C24H12.6 C25D7.15
## [56] C26C6.3 C27D8.1 C27D8.2 C27D8.3 C27H5.2
## [61] C28A5.4 C28A5.6 C28C12.1 C28F5.1 C30F12.5
## [66] C30G12.2 C30G12.6 C30G7.3 C34C12.6 C34C12.9
## [71] C34D4.1 C35B1.5 C35C5.1 C36B1.9 C36B7.6
## [76] C36C9.3 C36F7.1 C36F7.5 C37A5.9 C38D9.1
## [81] C38D9.3 C38H2.3 C39B5.3 C40A11.7 C41C4.1
## [86] C42D4.19 C43D7.10 C43D7.11 C44B7.11 C44H9.4
## [91] C45E1.1 C45G7.13 C46A5.8 C46F11.6 C46G7.3
## [96] C47B2.8 C48B6.9 C48D1.6 C49F5.1 C49G7.11
## [101] C53C7.5 C54E10.1 C55C3.3 C55C3.7 C56G7.3
## [106] D1005.3 D1037.3 D1054.5 D2024.1 D2024.18
## [111] D2085.1 F01D4.2 F01F1.3 F07A11.1 F07B7.12
## [116] F07D10.1 F07G6.7 F07H5.5 F08A8.2 F08F8.3
## [121] F08G12.4 F08G5.3 F09B12.3 F11A5.18 F13B12.7
## [126] F13H8.7 F13H8.8 F14F4.3 F16B4.6 F17B5.1
## [131] F19H6.5 F20D1.1 F21D5.4 F21D9.7 F22B7.9
## [136] F22F1.2 F22H10.10 F23A7.7 F23B12.4 F23C8.8
## [141] F25G6.1 F26D10.9 F28H7.3 F31C3.11 F31E9.6
## [146] F32A7.6 F32A7.7 F32D8.1 F33D11.8 F33E2.3
## [151] F33H12.1 F35C5.6 F35H10.5 F36A2.3 F36G9.4
## [156] F36H5.11 F37B4.2 F38A1.5 F39C12.1 F39D8.7
## [161] F39E9.6 F39G3.1 F40A3.5 F40D4.16 F40D4.17
## [166] F42C5.6 F42F12.3 F42G4.6 F42G4.7 F42G8.11
## [171] F42G8.4 F42G8.5 F44G3.6 F45C12.8 F45H10.1
## [176] F46C3.1 F46F5.3 F46F5.5 F47A4.2 F47A4.5
## [181] F47B8.6 F47B8.7 F47B8.8 F47F6.1 F48E3.9
## [186] F49B2.3 F49C12.15 F49C5.4 F49E11.2 F49E7.1
## [191] F52C9.1 F53A3.1 F53B6.2 F53B6.6 F53B6.7
## [196] F54C9.4 F54D10.2 F54F7.6 F54F7.7 F55A4.4
## [201] F55C9.13 F55C9.14 F55C9.7 F55C9.8 F55G1.10
## [206] F55G11.8 F55H12.3 F56B3.2 F56D12.4 F56D2.3
## [211] F56D6.15 F57G4.4 F57G4.8 F58B3.1 F58E1.13
## [216] F58E10.7 F58G6.3 F59A1.9 F59A7.7 F59E12.3
## [221] H05C05.4 H06O01.4 H14A12.5 H14E04.3 H19M22.2
## [226] K01D12.1 K02C4.4 K06C4.17 K07E8.10 K07H8.6
## [231] K08C9.7 K08D12.4 K08F8.6 K09A9.4 K09E3.6
## [236] K09F6.6 K09H9.8 K10B3.10 K10D2.8 K10F12.4
## [241] K10G9.2 K11C4.3 K11D2.1 K11D2.2 K12B6.9
## [246] M01E10.2 M01E11.4 M03A8.2 M04B2.6 M04B2.8
## [251] M04G12.2 R01H10.6 R02F2.2 R04B5.9 R06A10.1
## [256] R06C1.4 R07B1.10 R07E5.4 R07H5.4 R08E3.3
## [261] R102.4 R10E4.3 R10F2.1 R11E3.3 R12B2.1
## [266] R151.1 R186.1 T02B11.7 T02G5.4 T03F6.1
## [271] T04D1.4 T05C3.2 T05C3.6 T05D4.1 T05E11.9
## [276] T05G5.4 T07D4.7 T08B2.4 T10A3.1 T10C6.12
## [281] T10D4.3 T10D4.5 T11G6.7 T12B5.1 T12B5.2
## [286] T12B5.6 T13H5.3 T14G10.8 T14G11.3 T16G12.4
## [291] T16H12.2 T20F10.7 T20F5.5 T20F7.6 T20H4.2
## [296] T22A3.4 T23D8.6 T23F6.1 T23F6.3 T23G11.1
## [301] T23G11.11 T24A11.3 T25C12.3 T25D3.3 T27F2.4
## [306] T28B8.1 T28D6.4 T28F2.1 T28F3.4 T28H10.3
## [311] VW06B3R.1 VY10G11R.1 W01A11.3 W01A11.6 W01A11.7
## [316] W02A2.1 W02B8.3 W02D9.10 W03C9.6 W03D2.5
## [321] W03F11.5 W04A8.2 W04B5.3 W05B2.2 W05F2.6
## [326] W05H9.3 W06A11.4 W09D6.5 Y102A11A.3 Y102A5C.6
## [331] Y102A5C.8 Y105C5A.1269 Y110A7A.20 Y116A8C.18 Y116F11A.1
## [336] Y119D3B.21 Y119D3B.8 Y17D7B.10 Y17D7B.6 Y17D7B.8
## [341] Y17D7C.3 Y17D7C.4 Y18H1A.14 Y20F4.2 Y22F5A.5
## [346] Y26G10.5 Y34F4.5 Y37E11AL.1 Y37E3.5 Y37E3.7
## [351] Y37H2B.1 Y38F2AL.12 Y38F2AR.10 Y39A3CL.2 Y39A3CR.8
## [356] Y39B6A.11 Y39B6A.41 Y39B6A.9 Y39D8B.3 Y39G10AR.17
## [361] Y40A1A.4 Y43F8B.22 Y43F8B.8 Y43F8B.9 Y44E3A.2
## [366] Y45F10D.10 Y46C8AL.8 Y47D7A.16 Y47H10A.1 Y47H9C.10
## [371] Y48G10A.7 Y48G1C.5 Y49A3A.4 Y49F6A.10 Y49F6A.5
## [376] Y50E8A.14 Y53C10A.3 Y53F4B.16 Y53F4B.17 Y53F4B.45
## [381] Y53G8AM.1 Y53H1B.1 Y54E2A.4 Y54F10AM.11 Y54F10BM.15
## [386] Y54G2A.44 Y55F3AM.2 Y57G11C.1130 Y60A3A.24 Y60A3A.27
## [391] Y62E10A.3 Y62E10A.4 Y65B4BL.7 Y67D8A.3 Y68A4A.13
## [396] Y69A2AR.14 Y69A2AR.31 Y71F9AL.5 Y71G12A.3 Y71G12B.11
## [401] Y71H2AL.2 Y73F8A.32 Y75D11A.5 Y75D11A.7 Y76A2B.2
## [406] Y76B12C.4 Y7A5A.21 Y7A5A.22 Y80D3A.1 Y81G3A.4
## [411] Y82E9BL.18 Y82E9BR.4 Y87G2A.2 Y8G1A.2 Y94H6A.12
## [416] Y94H6A.7 Y95B8A.1 Y95B8A.10 ZC123.4 ZC15.1
## [421] ZC190.7 ZC190.8 ZC204.2 ZC317.6 ZC328.5
## [426] ZC416.1 ZC84.3 ZK1098.6 ZK1098.7 ZK1128.7
## [431] ZK1151.1 ZK1236.9 ZK131.5 ZK177.9 ZK228.1
## [436] ZK250.15 ZK355.2 ZK673.4 ZK856.5 ZK909.5
## [441] ZK945.1 ZK973.6
## 442 Levels: 3R5.1 AC3.5 B0001.5 B0035.7 B0261.8 B0280.15 B0284.2 ... ZK973.6
gene_order_and_positions_epimutable<-gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% epimut_genes_noiso),]
gene_order_and_positions_epimutable[sort(unique(c(which(diff(gene_order_and_positions_epimutable$order_pos)==1),
which(diff(gene_order_and_positions_epimutable$order_pos)==1)+1))),]
## cosmidID chr order_pos start end meanpos
## 30148 T23G11.11 I 1803 7701120 7701389 7701254
## 30146 T23G11.1 I 1804 7701364 7702053 7701708
## 1104 B0511.4 I 2708 10644244 10645617 10644930
## 1103 B0511.3 I 2709 10646761 10647777 10647269
## 1092 B0511.11 I 2710 10648605 10654179 10651392
## 23587 K11D2.1 I 3256 12497466 12505010 12501238
## 23589 K11D2.2 I 3257 12505027 12507569 12506298
## 14787 F32A7.6 I 3798 14850053 14855827 14852940
## 14788 F32A7.7 I 3799 14852817 14854607 14853712
## 4178 C16C8.20 II 4945 3446956 3448020 3447488
## 4179 C16C8.21 II 4946 3448205 3448759 3448482
## 5951 C30G12.2 II 6296 7290372 7291882 7291127
## 5953 C30G12.6 II 6297 7292191 7296015 7294103
## 36555 Y53F4B.45 II 8520 15042804 15044011 15043408
## 36511 Y53F4B.16 II 8521 15052084 15055718 15053901
## 36512 Y53F4B.17 II 8522 15058226 15062057 15060142
## 28657 T12B5.2 III 8781 956333 957947 957140
## 28649 T12B5.1 III 8782 962303 963679 962991
## 9580 D2024.1 IV 13989 7247542 7249437 7248490
## 9585 D2024.18 IV 13990 7250457 7254204 7252330
## 2521 C06E4.5 IV 13991 7254989 7255909 7255449
## 16715 F42G8.5 IV 14290 8136324 8139087 8137706
## 16712 F42G8.4 IV 14291 8139218 8143137 8141178
## 2758 C07G1.6 IV 14312 8209157 8210400 8209778
## 2759 C07G1.7 IV 14313 8210953 8211828 8211390
## 5517 C27D8.3 IV 15768 12706713 12710499 12708606
## 5516 C27D8.2 IV 15769 12712764 12714771 12713768
## 5515 C27D8.1 IV 15770 12717830 12719391 12718610
## 30137 T23F6.1 IV 15771 12719959 12720997 12720478
## 37611 Y62E10A.3 IV 15932 13359470 13360285 13359878
## 37612 Y62E10A.4 IV 15933 13365168 13368435 13366802
## 27428 T05C3.6 V 18122 5490369 5492261 5491315
## 27424 T05C3.2 V 18123 5493746 5503010 5498378
## 31261 W01A11.6 V 18482 6502368 6504181 6503274
## 31262 W01A11.7 V 18483 6504537 6505500 6505018
## 39377 ZC190.7 V 19246 8686070 8686864 8686467
## 39378 ZC190.8 V 19247 8686957 8687807 8687382
## 17711 F47B8.6 V 21491 14329068 14330080 14329574
## 17713 F47B8.8 V 21492 14331673 14332667 14332170
## 17712 F47B8.7 V 21493 14333415 14334803 14334109
## 33401 Y17D7B.6 V 22876 18771689 18772639 18772164
## 33402 Y17D7B.8 V 22877 18773060 18774024 18773542
## 19447 F55C9.7 V 22997 19297809 19298749 19298279
## 19441 F55C9.14 V 22998 19299561 19300461 19300011
## 19448 F55C9.8 V 22999 19301075 19302034 19301554
## 19440 F55C9.13 V 23000 19302786 19303728 19303257
## 35142 Y43F8B.9 V 23057 19479202 19480459 19479830
## 35127 Y43F8B.22 V 23058 19482834 19482984 19482909
## 19189 F54F7.6 X 27320 11865382 11866827 11866104
## 19190 F54F7.7 X 27321 11867970 11869360 11868665
## 8329 C49F5.1 X 27356 11965961 11968109 11967035
## 31214 VW06B3R.1 X 27357 11968033 12021209 11994621
#52 genes in total
#16 doublets
#4 triplets
#2 quadruplets
data_for_geomcol<-data.frame(count=c(16,4,2),size=factor(c("pairs","triplets","quadruplets"),levels=c("pairs","triplets","quadruplets")))
ggplot(data_for_geomcol)+geom_col(aes(y=count,x=size))+theme_classic()
ggsave("epimutable_gene_pairs_groupsizes.pdf")
## Saving 7 x 5 in image
#analysis by pairs (30 pairs)
gene1<-gene_order_and_positions_epimutable$cosmidID[which(diff(gene_order_and_positions_epimutable$order_pos)==1)]
gene2<-gene_order_and_positions_epimutable$cosmidID[which(diff(gene_order_and_positions_epimutable$order_pos)==1)+1]
gene1_iso<-c()
for (gene in gene1){
gene1_iso<-c(gene1_iso,as.character(iso_to_noiso[which(iso_to_noiso$noiso==gene),"iso"]))
}
gene2_iso<-c()
for (gene in gene2){
gene2_iso<-c(gene2_iso,as.character(iso_to_noiso[which(iso_to_noiso$noiso==gene),"iso"]))
}
test_coepimutation<-function(gene1,gene2,epimut_data,ttg_normcounts){
states_gene1<-epimut_data[toString(gene1),]
states_gene2<-epimut_data[toString(gene2),]
matching_states<-length(states_gene1)-sum(abs(states_gene1-states_gene2))
counts_gene1<-as.numeric(ttg_normcounts[toString(gene1),])
counts_gene2<-as.numeric(ttg_normcounts[toString(gene2),])
corr<-cor.test(counts_gene1,counts_gene2)
expected_matching_states<-rep(0,10000)
expected_corrs<-rep(0,10000)
for (i in seq(10000)){
random_states_gene1<-epimut_data[toString(gene1),][sample(seq(length(epimut_data[toString(gene1),])),size = length(epimut_data[toString(gene1),]),replace = FALSE)]
random_states_gene2<-epimut_data[toString(gene2),][sample(seq(length(epimut_data[toString(gene2),])),size = length(epimut_data[toString(gene2),]),replace = FALSE)]
expected_matching_states[i]<-(length(random_states_gene1)-sum(abs(random_states_gene1-random_states_gene2)))
random_counts_gene1<-as.numeric(ttg_normcounts[toString(gene1),][sample(seq(length(epimut_data[toString(gene1),])),size = length(epimut_data[toString(gene1),]),replace = FALSE)])
random_counts_gene2<-as.numeric(ttg_normcounts[toString(gene2),][sample(seq(length(epimut_data[toString(gene2),])),size = length(epimut_data[toString(gene2),]),replace = FALSE)])
expected_corrs[i]<-cor(random_counts_gene1,random_counts_gene2)
}
return(c(toString(gene1),length(which(states_gene1==2)),toString(gene2),length(which(states_gene2==2)),matching_states,ncol(epimut_data),(length(which(expected_matching_states>=matching_states))+1)/length(expected_matching_states),as.numeric(corr$estimate),corr$p.value,(length(which(expected_corrs>=corr$estimate))+1)/length(expected_corrs)))
}
#example
test_coepimutation(gene1_iso[1],gene2_iso[1],epimut_data,ttg_normcounts)
## [1] "T23G11.11" "13" "T23G11.1"
## [4] "18" "32" "47"
## [7] "0.044" "0.294758589588205" "0.0442960257556428"
## [10] "0.0307"
test_coepimutation_results<-c()
for (i in seq(length(gene1_iso))){
test_coepimutation_results<-rbind(test_coepimutation_results,test_coepimutation(gene1_iso[i],gene2_iso[i],epimut_data,ttg_normcounts))
}
colnames(test_coepimutation_results)<-c("gene1","gene1_up","gene2","gene2_up","matching states","total samples","sim_pvalue","pearson_r","pearson_pvalue","corrsim_pvalue")
test_coepimutation_results<-data.frame(test_coepimutation_results)
#sim test
test_coepimutation_results$sim_pvalue<-as.numeric(as.character(test_coepimutation_results$sim_pvalue))
test_coepimutation_results$sim_fdr<-p.adjust(test_coepimutation_results$sim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results))+geom_histogram(aes(x=sim_fdr))+ggtitle("simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(test_coepimutation_results$sim_fdr<0.1))
## [1] 13
length(which(test_coepimutation_results$sim_fdr<0.2))
## [1] 17
#pearson test
test_coepimutation_results$pearson_pvalue<-as.numeric(as.character(test_coepimutation_results$pearson_pvalue))
test_coepimutation_results$pearson_fdr<-p.adjust(test_coepimutation_results$pearson_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results))+geom_histogram(aes(x=pearson_fdr))+ggtitle("Pearson correlation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(test_coepimutation_results$pearson_fdr<0.1 & as.numeric(as.character(test_coepimutation_results$pearson_r>0.75))))
## Warning in Ops.factor(test_coepimutation_results$pearson_r, 0.75): '>' not
## meaningful for factors
## [1] 0
#correlation sim test
test_coepimutation_results$corrsim_pvalue<-as.numeric(as.character(test_coepimutation_results$corrsim_pvalue))
test_coepimutation_results$corrsim_fdr<-p.adjust(test_coepimutation_results$corrsim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results))+geom_histogram(aes(x=corrsim_fdr))+ggtitle("correlation simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(test_coepimutation_results$corrsim_fdr<0.1))
## [1] 25
length(which(test_coepimutation_results$corrsim_fdr<0.2))
## [1] 25
test_coepimutation_results
## gene1 gene1_up gene2 gene2_up matching.states total.samples
## 1 T23G11.11 13 T23G11.1 18 32 47
## 2 B0511.4 3 B0511.3 3 45 47
## 3 B0511.3 3 B0511.11 1 43 47
## 4 K11D2.1b 21 K11D2.2.1 21 31 47
## 5 F32A7.6 1 F32A7.7 6 42 47
## 6 C16C8.20 8 C16C8.21 11 44 47
## 7 C30G12.2 1 C30G12.6a 1 45 47
## 8 Y53F4B.45 9 Y53F4B.16 15 23 47
## 9 Y53F4B.16 15 Y53F4B.17 16 38 47
## 10 T12B5.2 4 T12B5.1 8 43 47
## 11 D2024.1 1 D2024.18 12 36 47
## 12 D2024.18 12 C06E4.5 12 43 47
## 13 F42G8.5 6 F42G8.4b.1 20 25 47
## 14 C07G1.6 1 C07G1.7 2 46 47
## 15 C27D8.3b 29 C27D8.2 2 16 47
## 16 C27D8.2 2 C27D8.1 1 46 47
## 17 C27D8.1 1 T23F6.1 1 47 47
## 18 Y62E10A.3 11 Y62E10A.4a 14 38 47
## 19 T05C3.6a 3 T05C3.2 4 44 47
## 20 W01A11.6 5 W01A11.7.2 1 43 47
## 21 ZC190.7 1 ZC190.8 3 45 47
## 22 F47B8.6 1 F47B8.8 1 47 47
## 23 F47B8.8 1 F47B8.7 1 47 47
## 24 Y17D7B.6 4 Y17D7B.8 1 44 47
## 25 F55C9.7 6 F55C9.14 1 42 47
## 26 F55C9.14 1 F55C9.8 18 30 47
## 27 F55C9.8 18 F55C9.13 1 30 47
## 28 Y43F8B.9 1 Y43F8B.22 10 38 47
## 29 F54F7.6.2 16 F54F7.7 1 30 47
## 30 C49F5.1.3 2 VW06B3R.1b 9 36 47
## sim_pvalue pearson_r pearson_pvalue corrsim_pvalue sim_fdr
## 1 0.0498 0.294758589588205 4.429603e-02 0.0285 0.10671429
## 2 0.0096 0.837538766022239 2.154051e-13 0.0001 0.04800000
## 3 1.0001 0.182310208223178 2.200067e-01 0.0800 1.00000000
## 4 0.0335 0.394822541196271 6.024619e-03 0.0035 0.09136364
## 5 0.1328 0.349537422325746 1.602753e-02 0.0311 0.20968421
## 6 0.0001 0.921168187789753 4.602067e-20 0.0001 0.00100000
## 7 1.0001 0.0220483474951594 8.830504e-01 0.3027 1.00000000
## 8 1.0001 -0.208118945975963 1.603794e-01 0.9278 1.00000000
## 9 0.0001 0.780757778649914 9.639870e-11 0.0001 0.00100000
## 10 0.0007 0.798455033274307 1.774885e-11 0.0001 0.00480000
## 11 0.2647 0.908508458714786 1.146427e-18 0.0001 0.37814286
## 12 0.0001 0.833955391221065 3.381609e-13 0.0001 0.00100000
## 13 0.8229 0.214568326315581 1.475260e-01 0.0809 1.00000000
## 14 0.0391 0.92034233274529 5.767113e-20 0.0001 0.09346154
## 15 1.0001 -0.19817663194178 1.817671e-01 0.9099 1.00000000
## 16 0.0405 0.867701452971962 2.957836e-15 0.0001 0.09346154
## 17 0.0226 0.888045891427894 8.624323e-17 0.0009 0.06780000
## 18 0.0008 0.442047628990477 1.865218e-03 0.0027 0.00480000
## 19 0.0164 0.812801021775346 3.962912e-12 0.0001 0.06780000
## 20 0.1103 0.17607982125463 2.364470e-01 0.0321 0.19464706
## 21 0.0649 0.978724970260119 1.341549e-32 0.0001 0.12980000
## 22 0.0219 0.99999438528493 5.080949e-113 0.0002 0.06780000
## 23 0.0220 0.999989508092554 6.535434e-107 0.0206 0.06780000
## 24 0.0874 0.945280496299038 1.609033e-23 0.0001 0.16387500
## 25 0.1233 0.646397463483123 9.204028e-07 0.0001 0.20550000
## 26 0.3778 0.665442199906775 3.340901e-07 0.0025 0.50269565
## 27 0.3854 0.689153001257861 8.506693e-08 0.0041 0.50269565
## 28 0.2149 0.264792562954239 7.206587e-02 0.0598 0.32235000
## 29 1.0001 -0.244498023159829 9.766050e-02 0.9638 1.00000000
## 30 1.0001 -0.133434148988213 3.712399e-01 0.8512 1.00000000
## pearson_fdr corrsim_fdr
## 1 6.328004e-02 0.0427500000
## 2 6.462154e-13 0.0002500000
## 3 2.444519e-01 0.0970800000
## 4 9.512556e-03 0.0061764706
## 5 2.404129e-02 0.0437727273
## 6 2.761240e-19 0.0002500000
## 7 8.830504e-01 0.3492692308
## 8 1.924553e-01 0.9597931034
## 9 2.065686e-10 0.0002500000
## 10 4.095889e-11 0.0002500000
## 11 4.913257e-18 0.0002500000
## 12 9.222570e-13 0.0002500000
## 13 1.844075e-01 0.0970800000
## 14 2.883557e-19 0.0002500000
## 15 2.097312e-01 0.9597931034
## 16 9.859455e-15 0.0002500000
## 17 3.234121e-16 0.0019285714
## 18 3.108697e-03 0.0050625000
## 19 9.907280e-12 0.0002500000
## 20 2.533361e-01 0.0437727273
## 21 1.341549e-31 0.0002500000
## 22 1.524285e-111 0.0004615385
## 23 9.803150e-106 0.0325263158
## 24 1.206775e-22 0.0002500000
## 25 1.624240e-06 0.0002500000
## 26 6.264189e-07 0.0050000000
## 27 1.701339e-07 0.0068333333
## 28 9.827165e-02 0.0780000000
## 29 1.273833e-01 0.9638000000
## 30 3.840413e-01 0.9457777778
#with data from gen 25-100 only
epimut_data_gens25_100<-epimut_data[,25:47]
ttg_normcounts_gens25_100<-ttg_normcounts[,25:47]
test_coepimutation_results_gens25_100<-c()
for (i in seq(length(gene1_iso))){
test_coepimutation_results_gens25_100<-rbind(test_coepimutation_results_gens25_100,test_coepimutation(gene1_iso[i],gene2_iso[i],epimut_data_gens25_100,ttg_normcounts_gens25_100))
}
colnames(test_coepimutation_results_gens25_100)<-c("gene1","gene1_up","gene2","gene2_up","matching states","total samples","sim_pvalue","pearson_r","pearson_pvalue","corrsim_pvalue")
test_coepimutation_results_gens25_100<-data.frame(test_coepimutation_results_gens25_100)
#sim test
test_coepimutation_results_gens25_100$sim_pvalue<-as.numeric(as.character(test_coepimutation_results_gens25_100$sim_pvalue))
test_coepimutation_results_gens25_100$sim_fdr<-p.adjust(test_coepimutation_results_gens25_100$sim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results_gens25_100))+geom_histogram(aes(x=sim_fdr))+ggtitle("simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(test_coepimutation_results_gens25_100$sim_fdr<0.1))
## [1] 4
length(which(test_coepimutation_results_gens25_100$sim_fdr<0.2))
## [1] 9
#pearson test
test_coepimutation_results_gens25_100$pearson_pvalue<-as.numeric(as.character(test_coepimutation_results_gens25_100$pearson_pvalue))
test_coepimutation_results_gens25_100$pearson_fdr<-p.adjust(test_coepimutation_results_gens25_100$pearson_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results_gens25_100))+geom_histogram(aes(x=pearson_fdr))+ggtitle("Pearson correlation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(test_coepimutation_results_gens25_100$pearson_fdr<0.1 & as.numeric(as.character(test_coepimutation_results_gens25_100$pearson_r>0.75))))
## Warning in Ops.factor(test_coepimutation_results_gens25_100$pearson_r, 0.75):
## '>' not meaningful for factors
## [1] 0
#correlation sim test
test_coepimutation_results_gens25_100$corrsim_pvalue<-as.numeric(as.character(test_coepimutation_results_gens25_100$corrsim_pvalue))
test_coepimutation_results_gens25_100$corrsim_fdr<-p.adjust(test_coepimutation_results_gens25_100$corrsim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results_gens25_100))+geom_histogram(aes(x=corrsim_fdr))+ggtitle("correlation simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
length(which(test_coepimutation_results_gens25_100$corrsim_fdr<0.1))
## [1] 22
length(which(test_coepimutation_results_gens25_100$corrsim_fdr<0.2))
## [1] 24
test_coepimutation_results_gens25_100
## gene1 gene1_up gene2 gene2_up matching.states total.samples
## 1 T23G11.11 11 T23G11.1 11 17 23
## 2 B0511.4 2 B0511.3 1 22 23
## 3 B0511.3 1 B0511.11 1 21 23
## 4 K11D2.1b 14 K11D2.2.1 7 14 23
## 5 F32A7.6 1 F32A7.7 5 19 23
## 6 C16C8.20 8 C16C8.21 11 20 23
## 7 C30G12.2 1 C30G12.6a 1 21 23
## 8 Y53F4B.45 8 Y53F4B.16 1 14 23
## 9 Y53F4B.16 1 Y53F4B.17 5 19 23
## 10 T12B5.2 4 T12B5.1 7 20 23
## 11 D2024.1 1 D2024.18 3 21 23
## 12 D2024.18 3 C06E4.5 3 23 23
## 13 F42G8.5 2 F42G8.4b.1 19 6 23
## 14 C07G1.6 1 C07G1.7 2 22 23
## 15 C27D8.3b 16 C27D8.2 2 5 23
## 16 C27D8.2 2 C27D8.1 1 22 23
## 17 C27D8.1 1 T23F6.1 1 23 23
## 18 Y62E10A.3 8 Y62E10A.4a 13 18 23
## 19 T05C3.6a 3 T05C3.2 3 21 23
## 20 W01A11.6 4 W01A11.7.2 1 20 23
## 21 ZC190.7 1 ZC190.8 3 21 23
## 22 F47B8.6 1 F47B8.8 1 23 23
## 23 F47B8.8 1 F47B8.7 1 23 23
## 24 Y17D7B.6 2 Y17D7B.8 1 22 23
## 25 F55C9.7 3 F55C9.14 1 21 23
## 26 F55C9.14 1 F55C9.8 8 16 23
## 27 F55C9.8 8 F55C9.13 1 16 23
## 28 Y43F8B.9 1 Y43F8B.22 9 15 23
## 29 F54F7.6.2 6 F54F7.7 1 16 23
## 30 C49F5.1.3 2 VW06B3R.1b 2 19 23
## sim_pvalue pearson_r pearson_pvalue corrsim_pvalue sim_fdr
## 1 0.0306 0.653901326740574 7.137350e-04 0.0007 0.1493333
## 2 0.0870 0.976185163336012 2.033724e-15 0.0001 0.2088462
## 3 1.0001 0.0291055435365434 8.951199e-01 0.1920 1.0000000
## 4 0.1246 0.488398602698691 1.805119e-02 0.0088 0.2329412
## 5 0.2162 0.345692067343827 1.061611e-01 0.0729 0.3243000
## 6 0.0005 0.922617632780601 3.793254e-10 0.0001 0.0075000
## 7 1.0001 -0.0130343903097682 9.529306e-01 0.4116 1.0000000
## 8 1.0001 0.144694828745383 5.100793e-01 0.2561 1.0000000
## 9 0.2147 0.815735686700017 2.100078e-06 0.0001 0.3243000
## 10 0.0044 0.783306203303661 9.880117e-06 0.0002 0.0330000
## 11 0.1299 0.97215343078085 1.032325e-14 0.0001 0.2329412
## 12 0.0003 0.831599459383295 8.790134e-07 0.0001 0.0075000
## 13 0.6790 0.423113986937124 4.425741e-02 0.0309 0.8487500
## 14 0.0905 0.920171483283003 5.201921e-10 0.0001 0.2088462
## 15 1.0001 -0.357614122537788 9.386684e-02 0.9589 1.0000000
## 16 0.0876 0.901500907420391 4.345013e-09 0.0002 0.2088462
## 17 0.0435 0.978700452025632 6.368905e-16 0.0018 0.1493333
## 18 0.0022 0.334402315604938 1.188563e-01 0.0658 0.0220000
## 19 0.0343 0.8696881250749 7.100788e-08 0.0006 0.1493333
## 20 0.1742 0.149010384942001 4.974038e-01 0.0797 0.2903333
## 21 0.1267 0.982024513534572 1.088019e-16 0.0001 0.2329412
## 22 0.0440 0.99999707169656 1.976699e-56 0.0005 0.1493333
## 23 0.0448 0.999992495941199 3.864323e-52 0.0415 0.1493333
## 24 0.0838 0.995851393322125 2.380291e-23 0.0001 0.2088462
## 25 0.1320 0.754377638386002 3.204956e-05 0.0209 0.2329412
## 26 0.3500 0.874561562077928 4.866332e-08 0.0004 0.4830000
## 27 0.3542 0.83844232277318 5.870626e-07 0.0062 0.4830000
## 28 0.3847 0.200588735287752 3.587661e-01 0.1565 0.5017826
## 29 1.0001 -0.32188404806676 1.341828e-01 0.9552 1.0000000
## 30 1.0001 -0.154456695369151 4.816321e-01 0.7940 1.0000000
## pearson_fdr corrsim_fdr
## 1 1.189558e-03 0.001500000
## 2 1.016862e-14 0.000375000
## 3 9.259861e-01 0.230400000
## 4 2.850188e-02 0.015529412
## 5 1.447651e-01 0.099409091
## 6 1.422470e-09 0.000375000
## 7 9.529306e-01 0.457333333
## 8 5.465136e-01 0.295500000
## 9 4.200157e-06 0.000375000
## 10 1.852522e-05 0.000600000
## 11 4.424251e-14 0.000375000
## 12 1.883600e-06 0.000375000
## 13 6.638611e-02 0.048789474
## 14 1.733974e-09 0.000375000
## 15 1.340955e-01 0.958900000
## 16 1.303504e-08 0.000600000
## 17 3.821343e-15 0.003600000
## 18 1.550299e-01 0.094000000
## 19 1.775197e-07 0.001384615
## 20 5.465136e-01 0.103956522
## 21 8.160140e-16 0.000375000
## 22 5.930096e-55 0.001250000
## 23 5.796484e-51 0.062250000
## 24 2.380291e-22 0.000375000
## 25 5.655804e-05 0.034833333
## 26 1.327181e-07 0.001090909
## 27 1.354760e-06 0.011625000
## 28 4.305193e-01 0.195625000
## 29 1.677285e-01 0.958900000
## 30 5.465136e-01 0.850714286
toplot<-test_coepimutation_results_gens25_100[which(as.numeric(as.character(test_coepimutation_results_gens25_100$pearson_r))>0.75),c("gene1","gene2")]
ticks<-colnames(ttg_normcounts)
for (row in seq(nrow(toplot))){
gene1<-as.character(toplot[row,"gene1"]); gene2<-as.character(toplot[row,"gene2"])
print(paste(gene1,gene2,sep=" - "))
pdf(paste(paste(paste("correlated_epimutations_examples/",gene1,sep=""),gene2,sep="_vs_"),"pdf",sep="."))
plot(as.numeric(ttg_normcounts[gene1,25:47]),as.numeric(ttg_normcounts[gene2,25:47]),xlab=paste("22G-RNA normalized counts in ",gene1,sep=""),
ylab=paste("22G-RNA normalized counts in ",gene2,sep=""),pch=16)
dev.off()
plot(as.numeric(ttg_normcounts[gene1,25:47]),as.numeric(ttg_normcounts[gene2,25:47]),xlab=paste("22G-RNA normalized counts in ",gene1,sep=""),
ylab=paste("22G-RNA normalized counts in ",gene2,sep=""),pch=16)
pdf(paste(paste("correlated_epimutations_examples/",gene1,sep=""),"pdf",sep="."))
plot(1:23,c(as.numeric(ttg_normcounts[gene1,36]),as.numeric(ttg_normcounts[gene1,25:35]),as.numeric(ttg_normcounts[gene1,37:47])),main=gene1,
xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)
dev.off()
plot(1:23,c(as.numeric(ttg_normcounts[gene1,36]),as.numeric(ttg_normcounts[gene1,25:35]),as.numeric(ttg_normcounts[gene1,37:47])),main=gene1,
xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)
pdf(paste(paste("correlated_epimutations_examples/",gene2,sep=""),"pdf",sep="."))
plot(1:23,c(as.numeric(ttg_normcounts[gene2,36]),as.numeric(ttg_normcounts[gene2,25:35]),as.numeric(ttg_normcounts[gene2,37:47])),col="red",main=gene2,
xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)
dev.off()
plot(1:23,c(as.numeric(ttg_normcounts[gene2,36]),as.numeric(ttg_normcounts[gene2,25:35]),as.numeric(ttg_normcounts[gene2,37:47])),col="red",main=gene2,
xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)
}
## [1] "B0511.4 - B0511.3"
## [1] "C16C8.20 - C16C8.21"
## [1] "Y53F4B.16 - Y53F4B.17"
## [1] "T12B5.2 - T12B5.1"
## [1] "D2024.1 - D2024.18"
## [1] "D2024.18 - C06E4.5"
## [1] "C07G1.6 - C07G1.7"
## [1] "C27D8.2 - C27D8.1"
## [1] "C27D8.1 - T23F6.1"
## [1] "T05C3.6a - T05C3.2"
## [1] "ZC190.7 - ZC190.8"
## [1] "F47B8.6 - F47B8.8"
## [1] "F47B8.8 - F47B8.7"
## [1] "Y17D7B.6 - Y17D7B.8"
## [1] "F55C9.7 - F55C9.14"
## [1] "F55C9.14 - F55C9.8"
## [1] "F55C9.8 - F55C9.13"
I checked whether these pairs tend to be in the same operon, and only two pairs out of 30 were - (F42G8.5-F42G8.4 and W01A11.6-W01A11.7) - and these two pairs have low 22G-RNA correlations across samples (0.17 and 0.21). So there is little evidence of spreading of silencing through operons in this dataset. In fact, most operons are enriched in active chromatin domains that tend to be devoid of epimutations. So perhaps the clustering we see is due to chromatin-based spreading.